library(tidyverse)
library(scales)
theme_set(theme_classic())

employed <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-02-23/employed.csv')

earn <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-02-23/earn.csv')

Let’s use k-means clustering to explore [employment by race and gender].

Explore data

employed %>% 
  count(race_gender)
## # A tibble: 6 x 2
##   race_gender                   n
##   <chr>                     <int>
## 1 Asian                      1254
## 2 Black or African American  1386
## 3 Men                        1386
## 4 TOTAL                      1386
## 5 White                      1386
## 6 Women                      1386
employed_tidied <- employed %>% 
  filter(!is.na(employ_n)) %>% 
  group_by(occupation = paste(industry, minor_occupation), 
           race_gender) %>% 
  summarize(n = mean(employ_n)) %>% 
  ungroup()
## `summarise()` has grouped output by 'occupation'. You can override using the `.groups` argument.

Get the data ready for k-means

employed_tidied %>% 
  filter(race_gender == "TOTAL")
## # A tibble: 239 x 3
##    occupation                                                race_gender       n
##    <chr>                                                     <chr>         <dbl>
##  1 Agriculture and related Construction and extraction occu… TOTAL        1.22e4
##  2 Agriculture and related Farming, fishing, and forestry o… TOTAL        9.56e5
##  3 Agriculture and related Installation, maintenance, and r… TOTAL        3.23e4
##  4 Agriculture and related Manage-ment, business, and finan… TOTAL        1.01e6
##  5 Agriculture and related Management, business, and financ… TOTAL        1.04e6
##  6 Agriculture and related Office and administrative suppor… TOTAL        8.58e4
##  7 Agriculture and related Production occupations            TOTAL        3.52e4
##  8 Agriculture and related Professional and related occupat… TOTAL        4.92e4
##  9 Agriculture and related Protective service occupations    TOTAL        1.47e4
## 10 Agriculture and related Sales and related occupations     TOTAL        1.57e4
## # … with 229 more rows
employed_demo <- employed_tidied %>% 
  filter(race_gender %in% c("Women", "Black or African American", "Asian")) %>% 
  pivot_wider(names_from = race_gender, values_from = n, values_fill = 0) %>%
  janitor::clean_names() %>% 
  left_join(employed_tidied %>% 
              filter(race_gender == "TOTAL") %>% 
              select(-race_gender) %>% 
              rename(total = n)) %>% 
  filter(total > 1e4) %>% 
  mutate(across(c(asian, black_or_african_american, women), ~ . / total), 
         total = log(total), 
         across(is.numeric, ~ as.numeric(scale(.)))) %>% 
  mutate(occupation = snakecase::to_snake_case(occupation))
## Joining, by = "occupation"
## Warning: Predicate functions must be wrapped in `where()`.
## 
##   # Bad
##   data %>% select(is.numeric)
## 
##   # Good
##   data %>% select(where(is.numeric))
## 
## ℹ Please update your code.
## This message is displayed once per session.

We are now ready to explore which occupations are most likely to be together in terms of demographic representations among these categories we have and total.

Implement k-means clustering!

employed_clust <- kmeans(select(employed_demo, -occupation), centers = 3)

summary(employed_clust)
##              Length Class  Mode   
## cluster      211    -none- numeric
## centers       12    -none- numeric
## totss          1    -none- numeric
## withinss       3    -none- numeric
## tot.withinss   1    -none- numeric
## betweenss      1    -none- numeric
## size           3    -none- numeric
## iter           1    -none- numeric
## ifault         1    -none- numeric
library(broom)

tidy(employed_clust)
## # A tibble: 3 x 7
##    asian black_or_african_american   women  total  size withinss cluster
##    <dbl>                     <dbl>   <dbl>  <dbl> <int>    <dbl> <fct>  
## 1 -0.842                    -0.728 -0.993  -0.582    63     95.7 1      
## 2  0.722                    -0.208  0.719   0.746    89    233.  2      
## 3 -0.190                     1.09  -0.0237 -0.504    59    116.  3
augment(employed_clust, employed_demo) %>% 
  ggplot(aes(total, women, color = .cluster)) +
  geom_point(alpha = .8)

Choosing K

kclusts <- tibble(k = 1:9) %>% 
  mutate(
    kclust = map(k, ~ kmeans(select(employed_demo, -occupation), .x)), 
    tidied = map(kclust, tidy), 
    glanced = map(kclust, glance), 
    augmented = map(kclust, augment, employed_demo)
  )

kclusts %>% 
  unnest(glanced) %>% 
  ggplot(aes(k, tot.withinss)) +
  geom_line(alpha = .8) +
  geom_point(size = 2)

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
employed_clust <- kmeans(select(employed_demo, -occupation), centers = 5)

p <- augment(employed_clust, employed_demo) %>% 
  ggplot(aes(total, women, color = .cluster, name = occupation)) +
  geom_point(alpha = .8)

ggplotly(p)